library(tidyverse)
library(knitr)
library(zoo)Team Project Proposal
For this document, the following packages are required:
Original Data Visualization in Climate Change
The ongoing dialogue surrounding climate change has increasingly focused on the empirical data that underscores global temperature trends. NOAA’s recent visualization of global temperature anomalies from 1880 to 2023 offers a comprehensive view of the planet’s warming trajectory. This project aims to elucidate the correlation between anthropogenic activities and climate change, highlighting how industrialization and carbon emissions have influenced global temperatures.
The visualization spans over a century of data, capturing periods of significant climatic shifts. Despite its clarity in depicting long-term trends, the inclusion of interactive features (as illustrated in Figure 1) enhances user engagement with the data. However, there is potential for further refinement. Incorporating more dynamic elements such as temporal sliders, geospatial overlays, and chloropleth map integration could offer a more nuanced understanding of how specific policies and global events impact climate change on a regional and global scale.
Critical Assessment of the Original Visualization
Proposed Improvements
Choropleth Map Integration
Data Cleaning
# Load the data
data <- read_csv("data/country_temp.csv")
# Convert the date column to Date format and extract the year and month
data$dt <- as.Date(data$dt, format = "%Y-%m-%d")
data$year <- as.numeric(format(data$dt, "%Y"))
data$month <- format(data$dt, "%m")
# Group by country, year, and month, then calculate the average temperatures rounded to 2 decimal points
monthly_avg_all_countries_initial <- data |>
group_by(Country, year, month) |>
summarize(monthly_avg = round(mean(AverageTemperature, na.rm = TRUE), 2))
# Display the tail of the initial data frame
knitr::kable(tail(monthly_avg_all_countries_initial), caption = "Tail of Monthly Average Temperatures by Country", digits = 2)| Country | year | month | monthly_avg |
|---|---|---|---|
| Åland | 2013 | 04 | 1.70 |
| Åland | 2013 | 05 | 10.33 |
| Åland | 2013 | 06 | 14.07 |
| Åland | 2013 | 07 | 16.45 |
| Åland | 2013 | 08 | 16.42 |
| Åland | 2013 | 09 | NaN |
# Filter the data for Afghanistan in the year 1839
afghanistan_1839_initial <- monthly_avg_all_countries_initial |>
filter(Country == "Afghanistan" & year == 1839)
# Display the filtered data
print(afghanistan_1839_initial)# A tibble: 12 × 4
# Groups: Country, year [1]
Country year month monthly_avg
<chr> <dbl> <chr> <dbl>
1 Afghanistan 1839 01 NaN
2 Afghanistan 1839 02 NaN
3 Afghanistan 1839 03 NaN
4 Afghanistan 1839 04 NaN
5 Afghanistan 1839 05 NaN
6 Afghanistan 1839 06 NaN
7 Afghanistan 1839 07 NaN
8 Afghanistan 1839 08 NaN
9 Afghanistan 1839 09 NaN
10 Afghanistan 1839 10 NaN
11 Afghanistan 1839 11 NaN
12 Afghanistan 1839 12 NaN
# Check for NA and NaN values in the monthly_avg_all_countries_initial dataframe
nan_check_initial <- sapply(monthly_avg_all_countries_initial, function(x) sum(is.na(x)))
print(nan_check_initial) Country year month monthly_avg
0 0 0 32651
# Display rows with NA and NaN values
rows_with_nan_initial <- monthly_avg_all_countries_initial[apply(monthly_avg_all_countries_initial, 1, function(row) any(is.na(row))), ]
print(rows_with_nan_initial)# A tibble: 32,651 × 4
# Groups: Country, year [3,860]
Country year month monthly_avg
<chr> <dbl> <chr> <dbl>
1 Afghanistan 1838 05 NaN
2 Afghanistan 1838 12 NaN
3 Afghanistan 1839 01 NaN
4 Afghanistan 1839 02 NaN
5 Afghanistan 1839 03 NaN
6 Afghanistan 1839 04 NaN
7 Afghanistan 1839 05 NaN
8 Afghanistan 1839 06 NaN
9 Afghanistan 1839 07 NaN
10 Afghanistan 1839 08 NaN
# ℹ 32,641 more rows
# Remove rows with "Antarctica" from the dataset as the entire dataset for it is NaN
monthly_avg_all_countries_initial <- monthly_avg_all_countries_initial |> filter(Country != "Antarctica")
# Verify that "Antarctica" rows are removed
print(monthly_avg_all_countries_initial |> filter(Country == "Antarctica"))# A tibble: 0 × 4
# Groups: Country, year [0]
# ℹ 4 variables: Country <chr>, year <dbl>, month <chr>, monthly_avg <dbl>
# Function to fill NaN values using zoo library
# It is used for filling missing values through linear interpolation and forward/backward filling, ensuring continuous and complete temperature records.
fill_nan <- function(df) {
df |>
group_by(Country) |>
mutate(monthly_avg = na.approx(monthly_avg, na.rm = FALSE, rule = 2)) |>
mutate(monthly_avg = ifelse(is.na(monthly_avg), zoo::na.locf(monthly_avg, na.rm = FALSE), monthly_avg)) |>
mutate(monthly_avg = ifelse(is.na(monthly_avg), zoo::na.locf(monthly_avg, fromLast = TRUE, na.rm = FALSE), monthly_avg)) |>
ungroup()
}
# Apply fill_nan function
monthly_avg_all_countries_filled <- fill_nan(monthly_avg_all_countries_initial)
# Function to fill remaining NaN values with median of 5-year window for the same month
fill_with_window_month_median <- function(df) {
df |>
group_by(Country, month) |>
mutate(monthly_avg = ifelse(is.na(monthly_avg),
sapply(seq_along(monthly_avg), function(i) {
if (is.na(monthly_avg[i])) {
start_year <- year[i] - 5
end_year <- year[i] + 5
window_median <- median(df$monthly_avg[df$Country == Country[i] & df$month == month[i] & df$year >= start_year &df$year <= end_year & !is.na(df$monthly_avg)], na.rm = TRUE)
return(window_median)
} else {
return(monthly_avg[i])
}
}),
monthly_avg)) |>
ungroup()
}
# Apply window month median filling function
monthly_avg_all_countries_median <- fill_with_window_month_median(monthly_avg_all_countries_filled)
# Round the final dataframe to 2 decimal places
monthly_avg_all_countries_median <- monthly_avg_all_countries_median |>
mutate(monthly_avg = round(monthly_avg, 2))
# Checking for remaining NaN values in the median imputed dataset
nan_check_median <- sapply(monthly_avg_all_countries_median, function(x) sum(is.na(x)))
print(nan_check_median) Country year month monthly_avg
0 0 0 0
# Display rows with NaN values in the median imputed dataset
rows_with_nan_median <- monthly_avg_all_countries_median[apply(monthly_avg_all_countries_median, 1, function(row) any(is.na(row))), ]
print(rows_with_nan_median)# A tibble: 0 × 4
# ℹ 4 variables: Country <chr>, year <dbl>, month <chr>, monthly_avg <dbl>
# Filter the data for Afghanistan in the year 1839
afghanistan_1839_median <- monthly_avg_all_countries_median |>
filter(Country == "Afghanistan" & year == 1839)
# Display the filtered data
print(afghanistan_1839_median)# A tibble: 12 × 4
Country year month monthly_avg
<chr> <dbl> <chr> <dbl>
1 Afghanistan 1839 01 6.51
2 Afghanistan 1839 02 6.03
3 Afghanistan 1839 03 5.55
4 Afghanistan 1839 04 5.07
5 Afghanistan 1839 05 4.59
6 Afghanistan 1839 06 4.11
7 Afghanistan 1839 07 3.62
8 Afghanistan 1839 08 3.14
9 Afghanistan 1839 09 2.66
10 Afghanistan 1839 10 2.18
11 Afghanistan 1839 11 1.7
12 Afghanistan 1839 12 1.22
Joining with Country Codes
country_codes <- read_csv("data/country_codes.csv")
country_codes$Country <- gsub("\\s*\\([^)]*\\)|\\s*\\[[^]]*\\]|\\s+Islands", "", country_codes$Country)
country_codes$Country <- gsub("Russian Federation", "Russia", country_codes$Country)
country_codes$Country <- gsub("United States of America", "United States", country_codes$Country)
country_codes$Country <- gsub("United Kingdom of Great Britain and Northern Ireland", "United Kingdom", country_codes$Country)
country_codes$Country <- gsub("Viet Nam", "Vietnam", country_codes$Country)
country_codes$Country <- gsub("Bonaire, Sint Eustatius and Saba", "Bonaire, Saint Eustatius And Saba", country_codes$Country)
country_codes$Country <- gsub("Myanmar", "Burma", country_codes$Country)
country_codes$Country <- gsub("Cabo Verde", "Cape Verde", country_codes$Country)
country_codes$Country <- gsub("Czechia", "Czech Republic", country_codes$Country)
country_codes$Country <- gsub("Côte d'Ivoire", "Côte D'Ivoire", country_codes$Country)
country_codes$Country <- gsub("Micronesia", "Federated States Of Micronesia", country_codes$Country)
country_codes$Country <- gsub("French Southern Territories", "French Southern And Antarctic Lands", country_codes$Country)
country_codes$Country <- gsub("Guinea-Bissau", "Guinea Bissau", country_codes$Country)
country_codes$Country <- gsub("Heard Island and McDonald", "Heard Island And Mcdonald", country_codes$Country)
country_codes$Country <- gsub("Isle of Man", "Isle Of Man", country_codes$Country)
country_codes$Country <- gsub("Macao", "Macau", country_codes$Country)
country_codes$Country <- gsub("Republic of North Macedonia", "Macedonia", country_codes$Country)
country_codes$Country <- gsub("Palestine, State of", "Palestina", country_codes$Country)
country_codes$Country <- gsub("Saint Vincent and The Grenadines", "Saint Vincent And The Grenadines", country_codes$Country)
country_codes$Country <- gsub("South Georgia And The South Sandwich", "South Georgia And The South Sandwich Isla", country_codes$Country)
country_codes$Country <- gsub("Syrian Arab Republic", "Syria", country_codes$Country)
country_codes$Country <- gsub("Tanzania, United Republic of", "Tanzania", country_codes$Country)
country_codes$Country <- gsub("Timor-Leste", "Timor Leste", country_codes$Country)
country_codes$Country <- gsub(" and ", " And ", country_codes$Country)
monthly_avg_all_countries_median$Country <- gsub("\\s*\\([^)]*\\)|\\s*\\[[^]]*\\]|\\s+Islands", "",monthly_avg_all_countries_median$Country)
joinedDT <-
full_join(
monthly_avg_all_countries_median,
country_codes,
by = c("Country")
)
missing_data <- joinedDT %>% filter(is.na(joinedDT$`Alpha2`))
unique_countries <- missing_data %>%
distinct(Country, .keep_all = TRUE)
# Print the unique countries
# Display the tail of the initial data frame
knitr::kable(unique_countries, caption = "List of Unique Countries with no Country Codes")| Country | year | month | monthly_avg | Alpha2 | Alpha3 | Numeric |
|---|---|---|---|---|---|---|
| Africa | 1850 | 01 | 19.76 | NA | NA | NA |
| Asia | 1816 | 04 | 6.98 | NA | NA | NA |
| Baker Island | 1825 | 01 | 25.58 | NA | NA | NA |
| British Virgin | 1824 | 01 | 24.79 | NA | NA | NA |
| Europe | 1743 | 11 | 3.94 | NA | NA | NA |
| Gaza Strip | 1808 | 10 | 21.31 | NA | NA | NA |
| Kingman Reef | 1883 | 01 | 25.65 | NA | NA | NA |
| Laos | 1816 | 03 | 21.65 | NA | NA | NA |
| Namibia | 1857 | 01 | 23.05 | NA | NAM | 516 |
| North America | 1768 | 09 | 9.76 | NA | NA | NA |
| Oceania | 1852 | 07 | 14.44 | NA | NA | NA |
| Palmyra Atoll | 1883 | 01 | 25.66 | NA | NA | NA |
| Reunion | 1787 | 01 | 26.11 | NA | NA | NA |
| Saint Vincent And The Grenadines | 1824 | 01 | 25.55 | NA | NA | NA |
| South America | 1851 | 01 | 23.64 | NA | NA | NA |
| South Georgia And The South Sandwich Isla | 1874 | 12 | 2.94 | NA | NA | NA |
| Swaziland | 1857 | 01 | 21.63 | NA | NA | NA |
| Turks And Caicas | 1823 | 01 | 23.82 | NA | NA | NA |
countries_cleaned <- joinedDT[!is.na(joinedDT$Alpha3), ]
# Check the cleaned data
library(DT)
datatable(countries_cleaned, caption = "Cleaned Data Joined with Country Codes")